import pandas as pd
import numpy as np
from plotly import graph_objects as go
import plotly.express as px

STRESS-DES studies
This notebook standardises the coding of the STRESS sections in use and displays an overall summary chart.
TM Notes for issues with current data:
- The coding of “met” or “not” needs to be double checked by a separate coder.
- Coding should be converted to “fully”, “partial”, “not” and “NA”. In line with STARS WP1.
- TM to discuss with FA. Noticed that system specification is met quite high. This is very different from STARS WP1 (where only 12.5% of 8 studies met it compared to nearly 80% in the large sample.)
This notebook creates 24 new columns in the dataset representing the 24 STRESS checklist items. It maps the coding of article use of STRESS subsections extracted from the publications to these columns.
1. Imports
1.1 Standard imports
1.2. Review pipeline imports
from data_pipeline import load_review_dataset, filter_to_application_studies

2. Constants and notebook level variables
Code
METHOD = 'DES'

# Dataset columns that hold the raw STRESS section coding extracted from
# each publication.
sections = ['stress_objectives', 'stress_logic', 'stress_data', 'stress_exp',
            'stress_imp', 'stress_code']

# STRESS checklist items by section (24 items in total)
section1 = ['1_1_purpose', '1_2_outputs', '1_3_aims']

section2 = ['2_1_diagram', '2_2_base_logic', '2_3_scenarios',
            '2_4_algorithms', '2_5_1_entities', '2_5_2_activities',
            '2_5_3_resources', '2_5_4_queues', '2_5_5_entryexit']

section3 = ['3_1_datasources', '3_2_preprocessing', '3_3_inputs',
            '3_4_assumptions']

section4 = ['4_1_initialisation', '4_2_runlength', '4_3_estimation']

section5 = ['5_1_language', '5_2_random', '5_3_execution', '5_4_system']

section6 = ['6_1_sharing']

# all checklist items flattened into a single list
new_columns = section1 + section2 + section3 + section4 + section5 + section6

# each section's full checklist as a single ';'-separated string
section1_str = ';'.join(section1)
section2_str = ';'.join(section2)
section3_str = ';'.join(section3)
section4_str = ';'.join(section4)
section5_str = ';'.join(section5)
section6_str = ';'.join(section6)

# 3. Helper functions
There are two helper functions in this notebook:
- construct_section_str: creates a standardised string representation of which checklist items have been applied.
- eval_chart: creates a plotly percentage bar chart to display results. This function was authored by Amy Heather and sourced from the STARS WP1 summary.
Code
def construct_section_str(section_headers: list[str], to_include: list[int]) -> str:
    '''
    Create a standardised string representation of the checklist items that
    have been used.

    Parameters:
    -----------
    section_headers: list[str]
        the checklist items represented as column headers for a pandas
        dataframe e.g. ['1_1_purpose', '1_2_outputs', '1_3_aims']

    to_include: list[int]
        indexes of the checklist items to return as a single string.
        e.g. [0, 2] would return '1_1_purpose;1_3_aims'

    Returns:
    -------
    out: str
        ';'-separated checklist items; an empty string when `to_include`
        is empty.
    '''
    # str.join replaces the original concatenate-then-strip-leading-';'
    # loop: linear time and no slicing needed.
    return ';'.join(section_headers[i] for i in to_include)
def eval_chart(df):
    '''
    Create a stacked bar chart presenting the results from evaluation for
    each study.

    Authored by Amy Heather and sourced from the STARS WP1 summary.

    Parameters:
    -----------
    df : dataframe
        Wide dataframe where columns are the result of evaluation
        ('fully', 'partially', 'not', 'na') and rows are the checklist
        items (guidelines).
    '''
    # Reshape wide -> long: one row per (guideline, result) pair.
    # Named `results_long` rather than `eval` (as before) to avoid
    # shadowing the builtin eval().
    results_long = (df
                    .melt(ignore_index=False)
                    .reset_index()
                    .rename(columns={'index': 'guideline',
                                     'variable': 'result',
                                     'value': 'count'}))
    # Add percentages: share of each result category within a guideline
    results_long['total'] = (results_long
                             .groupby('guideline')['count']
                             .transform('sum'))
    results_long['percent'] = results_long['count'] / results_long['total']
    results_long['percentage'] = (
        round(results_long['percent'] * 100, 1).astype(str) + '%')
    # Create stacked bar visualisation
    fig = px.bar(
        results_long,
        x='percent',
        y='guideline',
        color='result',
        color_discrete_map={'fully': '#06a94d',
                            'partially': '#ffd68c',
                            'not': '#ff9999',
                            'na': '#d1dcea'},
        orientation='h',
        hover_data={
            'count': True,
            'percent': False,
            'percentage': True,
            'guideline': False,
            'result': False})
    # Amend x axis label and ticks (fixed 0-100% range)
    fig.update_layout(xaxis=dict(
        range=[0, 1],
        tickmode='array',
        tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1],
        ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
        title=''))
    # Amend y axis label and order, and add space between ticks and plot
    fig.update_layout(yaxis=dict(
        autorange='reversed',
        title=''))
    fig.update_yaxes(ticksuffix=' ')
    # Relabel legend with human-readable result names
    fig.update_layout(legend_title_text='Result')
    newnames = {'fully': 'Fully met', 'partially': 'Partially met',
                'not': 'Not met', 'na': 'Not applicable'}
    # NOTE(review): assumes every trace name is one of the four result
    # categories; an unexpected category would raise KeyError here.
    fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))
    # Show without toolbar
    fig.show(config={'displayModeBar': False})

# 4. Recoding dictionary
This contains the mapping of the extracted data to the standardised codes used in the analysis. The mapping is stored in a nested dictionary, `translations`. The main key is one of the six sections in the STRESS review dataset:
- stress_objectives
- stress_logic
- stress_data
- stress_exp
- stress_imp
- stress_code
Code
# Mapping from the free-text coding extracted from publications to the
# standardised ';'-separated checklist-item strings. Outer keys are the six
# STRESS sections; inner keys are the literal extracted phrases and must
# match the data exactly — do not "fix" typos in keys (e.g. 'sceario').
translations = {
    'stress_objectives':
    {
        'Yes': section1_str,
        'Purpose': '1_1_purpose',
        # "all but aims" = purpose + outputs. Was section1[:1], which
        # dropped 1_2_outputs.
        'All but aims of experimentations': ';'.join(section1[:2]),
        # purpose + model outputs. Was section1[:1], which dropped
        # 1_2_outputs.
        'Purpose, model outputs': ';'.join(section1[:2]),
        'Purpose, aim of experimentation': f'{section1[0]};{section1[2]}'
    },
    'stress_logic':
    {
        'Yes': section2_str,
        'Model logic, and components': f"{section2[1]};{';'.join(section2[4:])}",
        'Base model logic and diagram, scenario logic, algorithms': ';'.join(section2[:4]),
        'Base model diagram, base model logic, scenario logic, algorithms': ';'.join(section2[:4]),
        'Base model logic and sceario logic': ';'.join(section2[1:3]),
        'Base model logic, base model overview diagram, algorithms, and components': construct_section_str(section2, [0, 1, 3, 4, 5, 6, 7, 8]),
        'Base model overview diagram and logic, scenario logic': construct_section_str(section2, [0, 1, 2]),
        'Base model logic and diagram': construct_section_str(section2, [0, 1]),
        # NOTE(review): phrase mentions "scenario logic" but index 2
        # (2_3_scenarios) is not included — confirm intended coding.
        'Base model overview, scenario logic, components': construct_section_str(section2, [0, 5, 6, 7, 8]),
    },
    'stress_data':
    {
        'Yes': section3_str,
        'Assumptions, input parameters': construct_section_str(section3, [2, 3]),
        'Data sources, input parameters': construct_section_str(section3, [0, 2]),
        'All but assumptions': construct_section_str(section3, [0, 1, 2]),
        'All but preprocessing': construct_section_str(section3, [0, 2, 3]),
        'Input parameters': construct_section_str(section3, [2]),
        'All but assumption': construct_section_str(section3, [0, 1, 2]),
        'Data sources, assumptions': construct_section_str(section3, [0, 3]),
        'Input parameters and assumptions': construct_section_str(section3, [2, 3]),
    },
    'stress_exp':
    {
        'Yes': section4_str,
        'Initialisation, run length': construct_section_str(section4, [0, 1]),
        'Estimation approach': construct_section_str(section4, [2]),
        'Run length': construct_section_str(section4, [1]),
        'All but initialisation': construct_section_str(section4, [1, 2]),
    },
    'stress_imp':
    {
        'Yes': section5_str,
        'Software, system specification': construct_section_str(section5, [0, 3]),
        'Software, random sampling,': construct_section_str(section5, [0, 1]),
        'All but model execution': construct_section_str(section5, [0, 1, 3]),
        'All but software': construct_section_str(section5, [1, 2, 3]),
        'All but system spec': construct_section_str(section5, [0, 1, 2]),
        'Software': construct_section_str(section5, [0]),
        'Software, random sampling': construct_section_str(section5, [0, 1]),
    },
    'stress_code':
    {
        'Yes': section6_str
    }
}

# 5. Filter to DES studies
As we are working with STRESS-DES we limit empirical studies to DES only.
# As we are working with STRESS-DES, limit the empirical (applied) studies
# to the discrete-event simulation method only.
_full_dataset = load_review_dataset()
_applied_studies = filter_to_application_studies(_full_dataset)
des_studies = _applied_studies.query(f'method == "{METHOD}"')

# 6. Recode and process
- Create the 24 new checklist item columns. By default all criteria are failed (“No”)
- Map extracted data to standardised codes
- Populate the checklist columns
- Calculate totals for criteria met or not.
6.1 Create checklist columns
# 6.1 Give every study a default "No" for each of the 24 checklist item
# columns; later steps flip items to "Yes" where evidence was extracted.
for checklist_item in new_columns:
    des_studies[checklist_item] = "No"

# 6.2. Map extracted data to standardised codes
# Translate the raw extracted text into the standardised code strings,
# then label any section with no extracted text as "None". All six raw
# section columns share the same fill value, so build the fillna mapping
# from the `sections` list.
des_studies = (
    des_studies
    .replace(translations)
    .fillna(dict.fromkeys(sections, "None"))
)

# 6.3 Populate the checklist columns
# code to populate subsection columns: flip each checklist item column to
# "Yes" where the study's recoded section string contains that item code.
section_subsections = [section1, section2, section3, section4, section5,
                       section6]
for sec, subsecs in zip(sections, section_subsections):
    for subsec in subsecs:
        # regex=False -> literal substring match (item codes are plain
        # text; avoids any accidental regex interpretation). na=False
        # keeps the mask boolean even if a section value were missing.
        mask = des_studies[sec].str.contains(subsec, regex=False, na=False)
        # update the subsection column to Yes for all rows that include it
        des_studies.loc[mask, subsec] = "Yes"

# 6.4 Calculate totals by criteria
# Result categories expected by eval_chart.
col = ['fully', 'partially', 'not', 'na']
criteria_dict = {}
for subsection in new_columns:
    # Count "Yes"/"No" explicitly by label. The previous positional
    # value_counts().tolist() code assumed [Yes, No] ordering, but
    # value_counts sorts by frequency — counts[0] could be the "No" count —
    # and counts[1] raised IndexError when a column was all one value.
    counts = des_studies[subsection].value_counts()
    met = int(counts.get("Yes", 0))
    not_met = int(counts.get("No", 0))
    # at the moment we only have met or not... missing partially or NA
    criteria_dict[subsection] = [met, 0, not_met, 0]
# Wide frame: rows = checklist items, columns = result categories.
criteria_wide = pd.DataFrame(criteria_dict, index=col).T

# 7. Results
Note at this time the review needs to be revisited to estimate fully met versus partially.
# Display the stacked percentage bar chart of STRESS checklist results.
eval_chart(criteria_wide)